Shortcuts:
| Insert a code chunk | Run | Knit | <- | %>% |
|---|---|---|---|---|
| cmd+alt+i | cmd+shift+enter | cmd+shift+k | alt+- | ctrl+shift+m |
library(forecast)
library(tsDyn)
library(zoo)
library(MASS)
library(reshape2)
library(reshape)  # loaded after reshape2, so reshape::melt() (variable_name=) masks reshape2::melt()
library(ggplot2)
library(dplyr)
library(gapminder)
library(smooth)
setwd("/Users/aubrey/Documents/GitHub/Master-s_Thesis")
unrate_data <- read.table("datasets/text_data/unrate.txt", header = TRUE,
col.names=c("time", "unemployment"), sep=",")
head(unrate_data)
## time unemployment
## 1 Q1-47 NaN
## 2 Q2-47 NaN
## 3 Q3-47 NaN
## 4 Q4-47 NaN
## 5 Q1-48 4.0
## 6 Q2-48 3.6
– At first glance a typical AR(2) model: the ACF tails off while the PACF roughly cuts off after lag 2:
print(Acf(unrate_data["unemployment"]))
##
## Autocorrelations of series 'unrate_data["unemployment"]', by lag
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## 1.000 0.951 0.870 0.772 0.675 0.598 0.536 0.483 0.436 0.401 0.367 0.337 0.313
## 13 14 15 16 17 18 19 20 21 22 23
## 0.292 0.279 0.273 0.271 0.268 0.256 0.237 0.211 0.184 0.160 0.139
print(Pacf(unrate_data["unemployment"]))
##
## Partial autocorrelations of series 'unrate_data["unemployment"]', by lag
##
## 1 2 3 4 5 6 7 8 9 10 11
## 0.951 -0.356 -0.124 0.025 0.162 0.008 -0.069 -0.022 0.140 -0.080 0.008
## 12 13 14 15 16 17 18 19 20 21 22
## 0.024 0.040 0.060 0.007 0.006 -0.016 -0.072 -0.016 -0.016 0.015 0.018
## 23
## -0.024
But the result of fitting auto.arima(unrate_ts_nona) points to ARIMA(1,1,2)(2,0,1)[4].
Whether the model fits the data perfectly does not matter here, because I aim to simulate linear and nonlinear time series; I only use the unemployment data as a base to embed more realistic information in the parameters. The simulated data is therefore semi-artificial.
– Transform dataframe to time series object:
# Convert "time" column to Date format
unrate_data$time <- as.Date(paste0(unrate_data$time, "-01"), format = "%b-%y-%d")
# Create a time series object
unrate_ts <- ts(unrate_data$unemployment, frequency = 4, start = c(1947, 1))
# Only the four 1947 quarters are NaN; drop them
unrate_ts_nona <- na.omit(unrate_ts)
# Print the transformed time series
head(unrate_ts_nona)
## Qtr1 Qtr2 Qtr3 Qtr4
## 1948 4.0 3.6 3.8 4.0
## 1949 5.0 6.2
plot(unrate_ts_nona)
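– Verify the auto.arima claim above (a quick sketch; output not shown here):
fit <- auto.arima(unrate_ts_nona)
summary(fit)  # reports ARIMA(1,1,2)(2,0,1)[4] for this series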
# Create an empty dataframe to store scenarios
scenario_df <- data.frame()
# Define the time series lengths
time_series_lengths <- c(60, 222)
# Define the amounts of time series
amount_of_time_series <- c(10, 101, 500)
# Define the type of intervention
te_intervention <- c("homogeneous", "heterogeneous")
# Define the proportion of control and treated units
proportion_control_treated <- 0.5
# Define the DGPs (Linear and Nonlinear)
dgps <- c("linear", "nonlinear")
# Loop through scenarios (note: the loop variable `length` shadows only the
# value, not the function, so calls like length(x) still work)
for (length in time_series_lengths) {
for (amount in amount_of_time_series) {
for (dgp in dgps) {
for (te in te_intervention){
# Fit DGP's model to the UNRATE data
if (dgp == "linear") {
model <- auto.arima(unrate_ts_nona)
# print(summary(model))
} else if (dgp == "nonlinear") {
model <- selectSETAR(unrate_ts_nona, m=1)
# manually record the data
model <- setar(unrate_ts_nona, m=1,
thDelay=model$bests["thDelay"], th=model$bests["th"])
# model$coefficients
B <- model$coefficients[1:(length(model$coefficients)-1)]
th <- model$coefficients["th"]
}
synthetic_data_list <- list() # Store synthetic data for this scenario
# Generate synthetic data for each time series in this scenario
for (n in 1:amount) {
# Simulate time series
temp <- TRUE
while(temp){
if (dgp == "linear") {
synthetic_data <- simulate(model, nsim = length, future=F)
} else {
synthetic_data <- setar.sim(B=B, n=length,lag=1,Thresh=th,
include=model$include, setarObject=model,type="simul",
starting=unrate_ts_nona[1])
}
# constraint: resample until the series stays within a plausible
# unemployment range of [0, 15]
temp <- any(synthetic_data < 0) || any(synthetic_data > 15)
}
# # Optional normalisation (not applied here; see the alternative simulation
# # below): first normalise to zero mean and unit variance. Then, to make the
# # scenarios more realistic, make every series non-negative by subtracting
# # its smallest value if that is negative. Since error measures such as
# # SMAPE are prone to issues with values close to zero (Hyndman and
# # Koehler, 2006), add one to the whole series if the minimum is below one.
# synthetic_data <- scale(synthetic_data)
# min <- min(synthetic_data)
# if (min < 0){
#   synthetic_data <- synthetic_data - min
# }
# if (min < 1){
#   synthetic_data <- synthetic_data + 1
# }
# synthetic_data <- t(synthetic_data)
synthetic_data_list[[n]] <- synthetic_data
}
# Combine synthetic data into a single dataframe
synthetic_data_df <- do.call(rbind, synthetic_data_list)
# synthetic_data_df <- cbind(c(1:amount), synthetic_data_df)
# Divide units into control and treated
num_control <- floor(amount * proportion_control_treated)
control_units <- synthetic_data_df[1:num_control, ]
treated_units <- synthetic_data_df[(num_control + 1):amount, ]
# Introduce homogeneous or heterogeneous interventions
## replicate the dataframe
# treated_units <- cbind.data.frame(rep(te_intervention, each=num_control),
# treated_units[rep(seq_len(nrow(treated_units)), times = 2),])
# Intervention at T0 = 49 or 211; the prediction range is the last 12 points
post_idx <- (length - 11):length
post_window <- treated_units[, post_idx]
quantile_90 <- quantile(unlist(post_window), 0.9)
affected <- post_window > quantile_90
scenario_sd <- sd(unlist(treated_units[, 1:(length - 10)]))
if (te == "homogeneous") {
# add one sd to every affected value
post_window[affected] <- post_window[affected] + scenario_sd
} else {
# heterogeneous: draw a separate multiplier r for each affected value
post_window[affected] <- post_window[affected] +
runif(sum(affected), 0.7, 1.5) * scenario_sd
}
treated_units[, post_idx] <- post_window
# Melt the groups
control_units_melted <- melt(control_units)
colnames(control_units_melted) <- c("series_id", "time", "value")
control_units_melted["c_t"] <- "control"
treated_units_melted <- melt(treated_units)
colnames(treated_units_melted) <- c("series_id", "time", "value")
treated_units_melted["c_t"] <- "treated"
treated_units_melted["series_id"] <- treated_units_melted["series_id"] + num_control
# Create a scenario row
scenario_row <- rbind(control_units_melted, treated_units_melted)
scenario_row["time_series_length"] <- length
scenario_row["amount_of_time_series"] <- amount
scenario_row["dgp"] <- dgp
scenario_row["te_intervention"] <- te
scenario_row["series_id"] <- with(scenario_row,
paste0(amount_of_time_series, "_",
series_id, "_", time_series_length, "_",
dgp, "_", te_intervention))
# Add the scenario row to the dataframe
scenario_df <- rbind(scenario_df, scenario_row)
}
}
}
}
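– The normalisation sketched in the comments above (and applied in the alternative simulation below) can be wrapped in a small helper. This is a hypothetical refactoring, with normalize_series as my own name:
normalize_series <- function(x) {
  x <- as.numeric(scale(x))  # zero mean, unit variance
  m <- min(x)
  if (m < 0) x <- x - m      # subtract a negative minimum so the series is non-negative
  if (m < 1) x <- x + 1      # add one when the minimum was below one, keeping values
                             # away from zero where SMAPE misbehaves (Hyndman and Koehler, 2006)
  return(x)
}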
– Read the results back from the saved simulation:
setwd("/Users/aubrey/Documents/GitHub/Master-s_Thesis")
# save scenario_df
# write.csv(scenario_df, "datasets/text_data/scenario_df.csv", row.names = FALSE)
scenario_df <- read.csv("datasets/text_data/scenario_df.csv")
– Check that there are 24 simulated scenarios and 4888 time series:
print(nrow(unique(scenario_df[c("time_series_length",
"amount_of_time_series", "dgp", "te_intervention")])))
## [1] 24
print(nrow(unique(scenario_df[c("series_id")])))
## [1] 4888
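– The counts follow directly from the design:
2 * 3 * 2 * 2                 # 24 scenarios: lengths x amounts x DGPs x interventions
2 * 2 * 2 * (10 + 101 + 500)  # 4888 series: each (length, dgp, te) cell has 611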
# Calculate an average
example <- scenario_df
example["average_or_not"] <- "individual"
# average of all time series
example_average <- example %>%
group_by(time) %>%
summarize(value = mean(value))
# average of all treated time series after time 49 (length-60 series)
example_average1 <- example %>%
filter(c_t == "treated" & time_series_length == 60 & time >= 49) %>%
group_by(time) %>%
summarize(value = mean(value))
# average of all treated time series after time 211 (length-222 series)
example_average2 <- example %>%
filter(c_t == "treated" & time_series_length == 222 & time >= 211) %>%
group_by(time) %>%
summarize(value = mean(value))
bind_average <- function(a, c_t, type, time_series_length){
a["average_or_not"] <- type
a["c_t"] <- c_t
a["amount_of_time_series"] <- 0
a["time_series_length"] <- time_series_length
a["dgp"] <- "average"
a["te_intervention"] <- "average"
a["series_id"] <- with(a,
paste0(amount_of_time_series, "_",
time_series_length, "_",
dgp, "_", te_intervention))
a <- as.data.frame(a)
a <- a[c("series_id", "time", "value", "c_t",
"time_series_length", "amount_of_time_series", "dgp",
"te_intervention", "average_or_not")]
return(a)
}
# example_add_average <- rbind(example, example_average)
example_add_average <- rbind(example, bind_average(example_average,
"average", "average_all", 222))
example_add_average <- rbind(example_add_average,
bind_average(example_average1,"treated",
"average_treated_60", 60))
example_add_average <- rbind(example_add_average,
bind_average(example_average2,"treated",
"average_treated_222", 222))
# Plotting
plot <- ggplot(data = example_add_average,
aes(x = time, y = value, group = series_id)) +
geom_line(aes(color = average_or_not, alpha = average_or_not)) +
scale_color_manual(values = c(average_all = "red",
average_treated_60 = "black", average_treated_222 = "black",
individual = "darkgrey")) +
scale_alpha_manual(values = c(average_all = 1,
average_treated_60 = 1, average_treated_222 = 1,
individual = 0.02))+
labs(x = "Time", y = "Unemployment rate") +
theme(aspect.ratio=2/5)
plot
# Save the plot
# ggsave(file = paste("Figure2_a.pdf"),
# plot = plot, width = 8, height = 5, dpi = 300)
# pre-intervention
pre_intervention <- quantile(scenario_df[(scenario_df$time<
(scenario_df$time_series_length-11)),
"value"], probs = seq(0, 1, 1/1000))
# post-intervention
post_intervention <- quantile(scenario_df[(scenario_df$c_t=="treated")&
(scenario_df$time>=(scenario_df$time_series_length-11)) ,
"value"], probs = seq(0, 1, 1/1000))
# true counterfactual
true_counterfactual <- quantile(scenario_df[(scenario_df$c_t=="control")&
(scenario_df$time>=(scenario_df$time_series_length-11)) ,
"value"], probs = seq(0, 1, 1/1000))
quantile_data <- data.frame(quantiles=as.numeric(gsub("%", "", names(pre_intervention))),
pre_intervention=unname(pre_intervention),
post_intervention=unname(post_intervention),
true_counterfactual=unname(true_counterfactual),
stringsAsFactors=FALSE)
quantile_data_melted <- melt(quantile_data, id.vars="quantiles", variable_name="type")
str(quantile_data_melted)
## 'data.frame': 3003 obs. of 3 variables:
## $ quantiles: num 0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 ...
## $ type : Factor w/ 3 levels "pre_intervention",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ value : num 0.000755 0.303936 0.478466 0.595044 0.69858 ...
quantile_plot <- ggplot(data = quantile_data_melted,
aes(x=quantiles, y=value)) +
geom_line(aes(color = type)) +
scale_color_manual(values = c(pre_intervention = "red",
post_intervention = "blue", true_counterfactual = "green"))+
geom_vline(xintercept=90, color = "purple", linetype = "dashed")+
labs(x = "Quantiles", y = "Unemployment rate") +
theme_minimal() +
theme(aspect.ratio=2/5)
quantile_plot
# Save the plot
# ggsave(file = paste("Figure2_b_1.pdf"), plot=quantile_plot,
# width = 8, height = 5, dpi = 300)
# The formula: percentage difference between post- and pre-intervention quantiles
# pre-intervention
pre_intervention_f <- quantile(scenario_df[(scenario_df$time<
(scenario_df$time_series_length-11)),
"value"], probs = c(0,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,
0.8,0.9,0.95,1))
# post-intervention
post_intervention_f <- quantile(scenario_df[(scenario_df$c_t=="treated")&
(scenario_df$time>=(scenario_df$time_series_length-11)) ,
"value"], probs = c(0,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,
0.8,0.9,0.95,1))
quantile_data_f <- data.frame(quantiles=names(pre_intervention_f),
pre_intervention=unname(pre_intervention_f),
post_intervention=unname(post_intervention_f),
Difference=(post_intervention_f-pre_intervention_f)/pre_intervention_f*100,
stringsAsFactors=FALSE)
quantile_data_f %>%
mutate(across(2:4, ~ round(.x, digits = 2)))
## quantiles pre_intervention post_intervention Difference
## 0% 0% 0.00 0.01 1152.43
## 5% 5% 2.28 1.94 -15.00
## 10% 10% 3.03 2.74 -9.61
## 20% 20% 4.00 3.85 -3.78
## 30% 30% 4.76 4.75 -0.29
## 40% 40% 5.47 5.58 1.99
## 50% 50% 6.18 6.39 3.40
## 60% 60% 6.92 7.21 4.15
## 70% 70% 7.75 8.11 4.75
## 80% 80% 8.73 9.18 5.14
## 90% 90% 10.10 11.32 12.05
## 95% 95% 11.16 14.55 30.32
## 100% 100% 15.00 18.61 24.08
I only imitate the MS-Hom-Short/Long setups from their repository (https://drive.google.com/file/d/1LDCEJzOmbmTN3wZXfKUyK8cgInbWaONM/view?usp=sharing), because I aim to create multiple time series that are all generated by the same DGP, and I only introduce a homogeneous or heterogeneous effect at the intervention.
set.seed(1)
setwd("/Users/aubrey/Documents/GitHub/Master-s_Thesis/Simulation")
source("./ar_coefficients_generator.R")
# Create an empty dataframe to store scenarios
scenario_df2 <- data.frame()
# Define the time series lengths
time_series_lengths <- c(60, 222)
# Define the DGPs: the first two are linear, the last two nonlinear
dgps <- c("AR(3)", "SAR(1)", "SETAR", "CLM")
# Define the amounts of time series
amount_of_time_series <- c(10, 101, 500)
# Define the type of intervention
te_intervention <- c("homogeneous", "heterogeneous")
# Define the proportion of control and treated units
proportion_control_treated <- 0.5
# Loop through scenarios
for (length in time_series_lengths) {
for (amount in amount_of_time_series) {
for (dgp in dgps) {
for (te in te_intervention){
synthetic_data_list <- list() # Store synthetic data for this scenario
# Generate synthetic data for each time series in this scenario
for (n in 1:amount) {
# Simulate time series
if (dgp == "AR(3)") {
lags <- 3 # for AR(3) process
maxRoot <- 5 # for a non exploding process
# Since the DGP takes some time to reach stability in its generated
# values, to avoid issues from initial values of the DGP, a burn-in
# period of 100 points is cut off from the beginning of the series.
burn_in <- 100
parameters <- generate_random_arma_parameters(lags, maxRoot)
synthetic_data <- arima.sim(model=list(ar=parameters), n=length, n.start = burn_in)
# normalize to 0 mean and unit variance
synthetic_data <- scale(synthetic_data)
min <- min(synthetic_data)
if (min < 0){
synthetic_data <- synthetic_data - min  # shift the series to be non-negative
}
if (min < 1){
synthetic_data <- synthetic_data + 1  # keep values away from zero (SMAPE issues)
}
synthetic_data <- t(synthetic_data)
}
if (dgp == "SAR(1)"){
# The USAccDeaths series contains the monthly totals of accidental
# deaths in the USA for the period from the year 1973 to 1978,
# with 72 data points
seasonal_arima_mod <- Arima(USAccDeaths, seasonal=c(1,0,0))
ts <- simulate(seasonal_arima_mod, nsim=length + burn_in,
seed=(n*(n-1) + amount))
synthetic_data <- ts[(length(ts) - length + 1) : length(ts)]
# normalize to 0 mean and unit variance
synthetic_data <- scale(synthetic_data)
min <- min(synthetic_data)
if (min < 0){
synthetic_data <- synthetic_data - min
}
if (min < 1){
synthetic_data <- synthetic_data + 1
}
synthetic_data <- t(synthetic_data)
}
if (dgp == "SETAR"){
# use the AR coefficients and the threshold values presented
# in the example of the tsDyn package
# setar related variables
lags <- 2
no_of_thresholds <- 1
threshold_val <- 2
coefficient_matrix <- c(2.9,-0.4,-0.1,-1.5, 0.2,0.3)
starting_values <- c(2.8, 2.2)
coefficient_matrix <- coefficient_matrix + rnorm(n=1, mean=0, sd=0.007)  # one small common jitter per series
synthetic_data <- setar.sim(B=coefficient_matrix, lag=lags, type="simul",
n = length, nthresh=no_of_thresholds, Thresh=threshold_val, starting=starting_values)
# normalize to 0 mean and unit variance
synthetic_data <- scale(synthetic_data)
min <- min(synthetic_data)
if (min < 0){
synthetic_data <- synthetic_data - min
}
if (min < 1){
synthetic_data <- synthetic_data + 1
}
synthetic_data <- t(synthetic_data)
}
if (dgp == "CLM"){
burn_in <- 40
initial_value <- 0.5
coefficient <- 3.6
ts <- numeric(length + burn_in)
ts[1] <- initial_value
for (i in 2:(length + burn_in)){
noise <- rnorm(1)
ts[i] <- max(coefficient*ts[i-1]*(1-ts[i-1]) + noise/10 , 0)
}
synthetic_data <- ts[(burn_in + 1):length(ts)]
}
synthetic_data_list[[n]] <- synthetic_data
}
# Combine synthetic data into a single dataframe
synthetic_data_df <- do.call(rbind, synthetic_data_list)
# synthetic_data_df <- cbind(c(1:amount), synthetic_data_df)
# Divide units into control and treated
num_control <- floor(amount * proportion_control_treated)
control_units <- synthetic_data_df[1:num_control, ]
treated_units <- synthetic_data_df[(num_control + 1):amount, ]
# Introduce homogeneous or heterogeneous interventions
# Intervention at T0 = 49 or 211; the prediction range is the last 12 points
post_idx <- (length - 11):length
post_window <- treated_units[, post_idx]
quantile_90 <- quantile(unlist(post_window), 0.9)
affected <- post_window > quantile_90
scenario_sd <- sd(unlist(treated_units[, 1:(length - 10)]))
if (te == "homogeneous") {
# add one sd to every affected value
post_window[affected] <- post_window[affected] + scenario_sd
} else {
# heterogeneous: draw a separate multiplier r for each affected value
post_window[affected] <- post_window[affected] +
runif(sum(affected), 0.7, 1.5) * scenario_sd
}
treated_units[, post_idx] <- post_window
# Melt the groups
control_units_melted <- melt(control_units)
colnames(control_units_melted) <- c("series_id", "time", "value")
control_units_melted["c_t"] <- "control"
treated_units_melted <- melt(treated_units)
colnames(treated_units_melted) <- c("series_id", "time", "value")
treated_units_melted["c_t"] <- "treated"
# Create a scenario row
scenario_row <- rbind(control_units_melted, treated_units_melted)
scenario_row["time_series_length"] <- length
scenario_row["amount_of_time_series"] <- amount
scenario_row["dgp"] <- dgp
scenario_row["te_intervention"] <- te
scenario_row["series_id"] <- with(scenario_row,
paste0(c_t, "_", amount_of_time_series, "_",
series_id, "_", time_series_length, "_",
dgp, "_", te_intervention))
# Add the scenario row to the dataframe
scenario_df2 <- rbind(scenario_df2, scenario_row)
}
}
}
}
– Read the results back from the saved simulation:
setwd("/Users/aubrey/Documents/GitHub/Master-s_Thesis")
# save scenario_df2
# write.csv(scenario_df2, "datasets/text_data/scenario_df_alternative.csv", row.names = FALSE)
scenario_df2 <- read.csv("datasets/text_data/scenario_df_alternative.csv")
– Check that there are 48 simulated scenarios and 9776 time series:
print(nrow(unique(scenario_df2[c("time_series_length",
"amount_of_time_series", "dgp", "te_intervention")])))
## [1] 48
print(nrow(unique(scenario_df2[c("series_id")])))
## [1] 9776
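– Again, the counts follow from the design:
2 * 3 * 4 * 2                 # 48 scenarios: lengths x amounts x DGPs x interventions
2 * 4 * 2 * (10 + 101 + 500)  # 9776 series: each (length, dgp, te) cell has 611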
– Check the different DGPs with time_series_length = 60 and amount_of_time_series = 10 — AR(3)
We use an AR(3) DGP to simulate simple linear patterns in the time series data. Following the procedure of Bergmeir et al. (2012), we first randomly produce the 3 roots of the characteristic polynomial and then generate the AR coefficients from them; a sketch of such a generator follows.
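A hypothetical sketch of this generator (the actual implementation lives in ar_coefficients_generator.R; both function names here are my own):
# multiply two polynomials given as coefficient vectors (constant term first)
poly_mult <- function(a, b) {
  out <- numeric(length(a) + length(b) - 1)
  for (i in seq_along(a)) for (j in seq_along(b))
    out[i + j - 1] <- out[i + j - 1] + a[i] * b[j]
  out
}
# draw real roots with modulus in (1.1, maxRoot) so the AR process is
# stationary, then expand prod_i (1 - x/root_i) = 1 - phi_1 x - ... - phi_p x^p
generate_random_ar_parameters_sketch <- function(lags, maxRoot) {
  roots <- runif(lags, min = 1.1, max = maxRoot)
  poly <- 1
  for (r in roots) poly <- poly_mult(poly, c(1, -1 / r))
  -poly[-1]  # phi_1..phi_p, usable as arima.sim(model = list(ar = .))
}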
plot_AR3 <- scenario_df2 %>%
filter(amount_of_time_series == 10 &
time_series_length == 60 & dgp == 'AR(3)') %>%
ggplot(aes(x = time, y = value, group = series_id)) +
geom_line(aes(color = c_t, alpha = te_intervention)) +
scale_color_manual(values = c("treated" = "red",
"control" = "black")) +
scale_alpha_manual(values = c("homogeneous" = 0.5,
"heterogeneous" = 1)) +
labs(title = "AR(3)",x = "Time", y = "Value") +
theme(aspect.ratio=2/5)
plot_AR3
— SAR(1) The SAR DGP simulates time series with a seasonality of a particular periodicity, which is also very common in real-world scenarios. We first fit an SAR model of order 1 to the USAccDeaths monthly series of the datasets package (R Core Team, 2020) available in the R core libraries; a standalone sketch of the recipe follows.
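A minimal standalone sketch (assuming the forecast package is loaded; the 100-point burn-in mirrors the loop above):
sar_fit <- Arima(USAccDeaths, seasonal = c(1, 0, 0))  # SAR(1) fit
sar_sim <- simulate(sar_fit, nsim = 60 + 100)         # simulate with burn-in
sar_sim <- tail(sar_sim, 60)                          # keep the last 60 points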
plot_SAR1<- scenario_df2 %>%
filter(amount_of_time_series == 10 &
time_series_length == 60 & dgp == 'SAR(1)') %>%
ggplot(aes(x = time, y = value, group = series_id)) +
geom_line(aes(color = c_t, alpha = te_intervention)) +
scale_color_manual(values = c("treated" = "red",
"control" = "black")) +
scale_alpha_manual(values = c("homogeneous" = 0.5,
"heterogeneous" = 1)) +
labs(title = "SAR1",x = "Time", y = "Value") +
theme(aspect.ratio=2/5)
plot_SAR1
— SETAR The TAR model involves \(k-1\) threshold values, which separate the space into \(k\) regimes, where each one is modelled by a different AR process of order \(p\).
We use the AR coefficients and the threshold value presented in the example of the tsDyn package; written out, the simulated model is shown below.
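My reading of the coefficient vector c(2.9, -0.4, -0.1, -1.5, 0.2, 0.3), assuming tsDyn's default threshold delay so that the threshold variable is \(y_{t-1}\):
\[
y_t = \begin{cases}
2.9 - 0.4\,y_{t-1} - 0.1\,y_{t-2} + \varepsilon_t, & y_{t-1} \le 2,\\
-1.5 + 0.2\,y_{t-1} + 0.3\,y_{t-2} + \varepsilon_t, & y_{t-1} > 2.
\end{cases}
\]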
plot_SETAR<- scenario_df2 %>%
filter(amount_of_time_series == 10 &
time_series_length == 60 & dgp == 'SETAR') %>%
ggplot(aes(x = time, y = value, group = series_id)) +
geom_line(aes(color = c_t, alpha = te_intervention)) +
scale_color_manual(values = c("treated" = "red",
"control" = "black")) +
scale_alpha_manual(values = c("homogeneous" = 0.5,
"heterogeneous" = 1)) +
labs(title = "SETAR",x = "Time", y = "Value") +
theme(aspect.ratio=2/5)
plot_SETAR
— CLM The chaotic logistic map is a zero-bounded chaotic process.
We use it to make the simulated time series difficult for forecasting techniques to model; the recursion it implements in the code above is written out below.
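In the simulation loop above the CLM step corresponds to
\[
y_t = \max\{\,3.6\,y_{t-1}(1 - y_{t-1}) + \varepsilon_t/10,\; 0\,\}, \qquad \varepsilon_t \sim N(0,1),
\]
started from \(y_1 = 0.5\) with a burn-in of 40 points.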
plot_CLM<- scenario_df2 %>%
filter(amount_of_time_series == 10 &
time_series_length == 60 & dgp == 'CLM') %>%
ggplot(aes(x = time, y = value, group = series_id)) +
geom_line(aes(color = c_t, alpha = te_intervention)) +
scale_color_manual(values = c("treated" = "red",
"control" = "black")) +
scale_alpha_manual(values = c("homogeneous" = 0.5,
"heterogeneous" = 1)) +
labs(title = "CLM",x = "Time", y = "Value") +
theme(aspect.ratio=2/5)
plot_CLM
# Calculate an average
example <- scenario_df2
example["average_or_not"] <- "individual"
# average of all time series
example_average <- example %>%
group_by(time) %>%
summarize(value = mean(value))
# average of all treated time series after time 49 (length-60 series)
example_average1 <- example %>%
filter(c_t == "treated" & time_series_length == 60 & time >= 49) %>%
group_by(time) %>%
summarize(value = mean(value))
# average of all treated time series after time 211 (length-222 series)
example_average2 <- example %>%
filter(c_t == "treated" & time_series_length == 222 & time >= 211) %>%
group_by(time) %>%
summarize(value = mean(value))
bind_average <- function(a, c_t, type, time_series_length){
a["average_or_not"] <- type
a["c_t"] <- c_t
a["amount_of_time_series"] <- 0
a["time_series_length"] <- time_series_length
a["dgp"] <- "average"
a["te_intervention"] <- "average"
a["series_id"] <- with(a,
paste0(c_t, "_", amount_of_time_series, "_",
time_series_length, "_",
dgp, "_", te_intervention))
a <- as.data.frame(a)
a <- a[c("series_id", "time", "value", "c_t",
"time_series_length", "amount_of_time_series", "dgp",
"te_intervention", "average_or_not")]
return(a)
}
# example_add_average <- rbind(example, example_average)
example_add_average <- rbind(example, bind_average(example_average,
"average", "average_all", 222))
example_add_average <- rbind(example_add_average,
bind_average(example_average1,"treated",
"average_treated_60", 60))
example_add_average <- rbind(example_add_average,
bind_average(example_average2,"treated",
"average_treated_222", 222))
# Plotting
plot <- ggplot(data = example_add_average,
aes(x = time, y = value, group = series_id)) +
geom_line(aes(color = average_or_not, alpha = average_or_not)) +
scale_color_manual(values = c(average_all = "red",
average_treated_60 = "black", average_treated_222 = "black",
individual = "darkgrey")) +
scale_alpha_manual(values = c(average_all = 1,
average_treated_60 = 1, average_treated_222 = 1,
individual = 0.02))+
labs(x = "Time", y = "Unemployment rate") +
theme(aspect.ratio=2/5)
plot
# Save the plot
# ggsave(file = paste("Figure2_a.pdf"),
# plot = plot, width = 8, height = 5, dpi = 300)
# pre-intervention
pre_intervention <- quantile(scenario_df2[(scenario_df2$time<
(scenario_df2$time_series_length-11)),
"value"], probs = seq(0, 1, 1/1000))
# post-intervention
post_intervention <- quantile(scenario_df2[(scenario_df2$c_t=="treated")&
(scenario_df2$time>=(scenario_df2$time_series_length-11)) ,
"value"], probs = seq(0, 1, 1/1000))
# true counterfactual
true_counterfactual <- quantile(scenario_df2[(scenario_df2$c_t=="control")&
(scenario_df2$time>=(scenario_df2$time_series_length-11)) ,
"value"], probs = seq(0, 1, 1/1000))
quantile_data <- data.frame(quantiles=as.numeric(gsub("%", "", names(pre_intervention))),
pre_intervention=unname(pre_intervention),
post_intervention=unname(post_intervention),
true_counterfactual=unname(true_counterfactual),
stringsAsFactors=FALSE)
quantile_data_melted <- melt(quantile_data, id.vars="quantiles", variable_name="type")
str(quantile_data_melted)
## 'data.frame': 3003 obs. of 3 variables:
## $ quantiles: num 0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 ...
## $ type : Factor w/ 3 levels "pre_intervention",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ value : num 0 0 0 0 0 0 0 0 0 0 ...
quantile_plot <- ggplot(data = quantile_data_melted,
aes(x=quantiles, y=value)) +
geom_line(aes(color = type)) +
scale_color_manual(values = c(pre_intervention = "red",
post_intervention = "blue", true_counterfactual = "green"))+
geom_vline(xintercept=90, color = "purple", linetype = "dashed")+
labs(x = "Quantiles", y = "Unemployment rate") +
theme_minimal() +
theme(aspect.ratio=2/5)
quantile_plot
# Save the plot
# ggsave(file = paste("Figure2_b_1.pdf"), plot=quantile_plot,
# width = 8, height = 5, dpi = 300)
# The formula: percentage difference between post- and pre-intervention quantiles
# pre-intervention
pre_intervention_f <- quantile(scenario_df2[(scenario_df2$time<
(scenario_df2$time_series_length-11)),
"value"], probs = c(0,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,
0.8,0.9,0.95,1))
# post-intervention
post_intervention_f <- quantile(scenario_df2[(scenario_df2$c_t=="treated")&
(scenario_df2$time>=(scenario_df2$time_series_length-11)) ,
"value"], probs = c(0,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,
0.8,0.9,0.95,1))
quantile_data_f <- data.frame(quantiles=names(pre_intervention_f),
pre_intervention=unname(pre_intervention_f),
post_intervention=unname(post_intervention_f),
Difference=(post_intervention_f-pre_intervention_f)/pre_intervention_f*100,
stringsAsFactors=FALSE)
quantile_data_f %>%
mutate(across(2:4, ~ round(.x, digits = 2)))
## quantiles pre_intervention post_intervention Difference
## 0% 0% 0.00 0.00 NaN
## 5% 5% 0.22 0.23 6.47
## 10% 10% 0.51 0.52 1.49
## 20% 20% 0.84 0.84 -0.19
## 30% 30% 1.76 1.63 -7.32
## 40% 40% 2.42 2.29 -5.31
## 50% 50% 2.96 2.82 -4.65
## 60% 60% 3.41 3.27 -4.02
## 70% 70% 3.78 3.66 -3.14
## 80% 80% 4.14 4.03 -2.73
## 90% 90% 4.62 4.60 -0.40
## 95% 95% 5.02 5.92 17.87
## 100% 100% 8.97 8.58 -4.39